Data Import
concrete = read_csv("concrete.csv") %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## Cement = col_double(),
## BlastFurnaceSlag = col_double(),
## FlyAsh = col_double(),
## Water = col_double(),
## Superplasticizer = col_double(),
## CoarseAggregate = col_double(),
## FineAggregate = col_double(),
## Age = col_integer(),
## CompressiveStrength = col_double()
## )
# matrix of predictors
x <- model.matrix(compressive_strength~.,concrete)[,-1]
# vector of response
y <- concrete$compressive_strength
theme1 <- trellis.par.get()
theme1$plot.symbol$col <- rgb(.2, .4, .2, .5)
theme1$plot.symbol$pch <- 16
theme1$plot.line$col <- rgb(.8, .1, .1, 1)
theme1$plot.line$lwd <- 2
theme1$strip.background$col <- rgb(.0, .2, .6, .2)
trellis.par.set(theme1)
featurePlot(x, y, plot = "scatter", labels = c("","compressive strength"),
type = c("p"), layout = c(4, 2))
#polynomial regression
fit1 <- lm(compressive_strength~water, data = concrete)
fit2 <- lm(compressive_strength~poly(water,2), data = concrete)
fit3 <- lm(compressive_strength~poly(water,3), data = concrete)
fit4 <- lm(compressive_strength~poly(water,4), data = concrete)
set.seed(1)
mseK <- rep(NA, 4)
for (i in 1:4) {
fit <- glm(compressive_strength ~ poly(water, i), data = concrete)
mseK[i] <- cv.glm(concrete, fit, K = 10)$delta[1]
}
plot(1:4, mseK, xlab = "Exponent", ylab = "Test MSE", type = "l")
The cross validation approach produced models with very similar RMSE values. While models with the exponent of 2 and 3 for water are the lowest they aren’t by much. With that being said, the r squared value for the exponent 4 is the highest indicating that the differences in compressive_strength is more explained by water^4 than any other polynomial function. Using the for loop, the cross validation provided an exponent of 4 as the best polynomial function. Therefore lets confirm using the anova test.
anova(fit1,fit2,fit3,fit4)
## Analysis of Variance Table
##
## Model 1: compressive_strength ~ water
## Model 2: compressive_strength ~ poly(water, 2)
## Model 3: compressive_strength ~ poly(water, 3)
## Model 4: compressive_strength ~ poly(water, 4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1028 263085
## 2 1027 247712 1 15372.8 68.140 4.652e-16 ***
## 3 1026 235538 1 12174.0 53.962 4.166e-13 ***
## 4 1025 231246 1 4291.5 19.022 1.423e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot1 <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
geom_point(color = rgb(.2, .4, .2, .5))
plot1
Using the anova test, models with an exponent of 2, 3, and 4 are significant. However, lets use d=2 since we want the most parsimonious model.
plot(compressive_strength ~ water, data = concrete, col = "blue")
waterlims <- range(concrete$water)
water.grid <- seq(from = waterlims[1], to = waterlims[2], by = 1)
preds1 <- predict(fit1, newdata = data.frame(water = water.grid))
preds2 <- predict(fit2, newdata = data.frame(water = water.grid))
preds3 <- predict(fit3, newdata = data.frame(water = water.grid))
preds4 <- predict(fit4, newdata = data.frame(water = water.grid))
lines(water.grid, preds1, col = "red", lwd = 2)
lines(water.grid, preds2, col = "purple", lwd = 2)
lines(water.grid, preds3, col = "green", lwd = 2)
lines(water.grid, preds4, col = "brown", lwd = 2)
plot(compressive_strength ~ water, data = concrete, col = "red")
waterlims <- range(concrete$water)
water.grid <- seq(from = waterlims[1], to = waterlims[2], by = 1)
fit <- lm(compressive_strength ~ poly(water, 4), data = concrete)
preds <- predict(fit, newdata = data.frame(water = water.grid))
lines(water.grid, preds, col = "blue", lwd = 2)
fit.ss <- smooth.spline(concrete$water, concrete$compressive_strength)
fit.ss$df
## [1] 68.88205
pred.ss <- predict(fit.ss, x = water.grid)
pred.ss.df <- data.frame(pred = pred.ss$y,
water = water.grid)
p <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
geom_point(color = rgb(.2, .4, .2, .5))
p + geom_line(aes(x = water, y = pred), data = pred.ss.df,
color = rgb(.8, .1, .1, 1)) + theme_bw() +
labs(title = 'Select degrees of freedom using GCV') +
theme(plot.title = element_text(hjust = 0.5))
Here we will fit for different degrees of freedom for 2 to 10.
par(mfrow = c(3,3)) # 3 x 3 grid
all.dfs = rep(NA, 9)
for (i in 2:10) {
fit.ss = smooth.spline(concrete$water, concrete$compressive_strength, df = i)
pred.ss <- predict(fit.ss, x = water.grid)
plot(concrete$water, concrete$compressive_strength, cex = .5, col = "darkgrey")
title(paste("Degrees of freedom = ", round(fit.ss$df)), outer = F)
lines(water.grid, pred.ss$y, lwd = 2, col = "blue")
}
#Use cross validation to find degrees of freedom
fit.ss <- smooth.spline(concrete$water, concrete$compressive_strength)
fit.ss$df
## [1] 68.88205
pred.smooth <- predict(fit.ss, x = water.grid)
pred.smooth.df <- data.frame(pred = pred.ss$y,
water = water.grid)
fitplot <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
geom_point(color = rgb(.2, .2, .4, .1)) + geom_line(aes(x = water, y = pred), data = pred.smooth.df,
color = rgb(.8, .1, .1, 1)) + theme_bw()
fitplot
#fit 1
fit.ss1 <- smooth.spline(concrete$water, concrete$compressive_strength, df = 10)
fit.ss1
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength,
## df = 10)
##
## Smoothing Parameter spar= 0.8687676 lambda= 0.001391258 (11 iterations)
## Equivalent Degrees of Freedom (Df): 10.00132
## Penalized Criterion (RSS): 81156.22
## GCV: 221.8629
pred.ss1 <- predict(fit.ss1, x = water.grid)
pred.ss.df1 <- data.frame(pred = pred.ss1$y, water = water.grid)
#fit 2
fit.ss2 <- smooth.spline(concrete$water, concrete$compressive_strength, df = 30)
fit.ss2
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength,
## df = 30)
##
## Smoothing Parameter spar= 0.5500421 lambda= 6.929549e-06 (11 iterations)
## Equivalent Degrees of Freedom (Df): 30.00244
## Penalized Criterion (RSS): 59933.93
## GCV: 208.9676
pred.ss2 <- predict(fit.ss2, x = water.grid)
pred.ss.df2 <- data.frame(pred = pred.ss2$y, water = water.grid)
#fit 3
fit.ss3 <- smooth.spline(concrete$water, concrete$compressive_strength, df = 50)
fit.ss3
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength,
## df = 50)
##
## Smoothing Parameter spar= 0.3836017 lambda= 4.347305e-07 (11 iterations)
## Equivalent Degrees of Freedom (Df): 50.00509
## Penalized Criterion (RSS): 46068.79
## GCV: 202.715
pred.ss3 <- predict(fit.ss3, x = water.grid)
pred.ss.df3 <- data.frame(pred = pred.ss3$y, water = water.grid)
#cv fit
fit.ss <- smooth.spline(concrete$water, concrete$compressive_strength)
fit.ss
## Call:
## smooth.spline(x = concrete$water, y = concrete$compressive_strength)
##
## Smoothing Parameter spar= 0.2578404 lambda= 5.371698e-08 (11 iterations)
## Equivalent Degrees of Freedom (Df): 68.88205
## Penalized Criterion (RSS): 36681.64
## GCV: 200.2892
pred.ss <- predict(fit.ss, x = water.grid)
pred.ss.df <- data.frame(pred = pred.ss$y, water = water.grid)
#plot of fits
scatter <- ggplot(data = concrete, aes(x = water, y = compressive_strength)) +
geom_point(color = rgb(.2, .4, .2, .5))
smooth_fits <- scatter +
geom_line(aes(x = water, y = pred), data = pred.ss.df1,
color = rgb(.8, .1, .1, 1)) +
geom_line(aes(x = water, y = pred), data = pred.ss.df2,
color = rgb(0, 0, 1, 1)) +
geom_line(aes(x = water, y = pred), data = pred.ss.df3,
color = rgb(1, 0, 1, 1)) +
geom_line(aes(x = water, y = pred), data = pred.ss.df,
color = rgb(.2, .2, .4, 1)) +
theme_bw()
smooth_fits
gam.m1 <- gam(compressive_strength ~ cement + blast_furnace_slag + fly_ash + water + superplasticizer + coarse_aggregate + fine_aggregate + age, data = concrete)
gam.m2 <- gam(compressive_strength ~ cement + blast_furnace_slag + fly_ash + s(water) + superplasticizer + coarse_aggregate + fine_aggregate + age, data = concrete)
anova(gam.m1, gam.m2, test = "F")
## Analysis of Deviance Table
##
## Model 1: compressive_strength ~ cement + blast_furnace_slag + fly_ash +
## water + superplasticizer + coarse_aggregate + fine_aggregate +
## age
## Model 2: compressive_strength ~ cement + blast_furnace_slag + fly_ash +
## s(water) + superplasticizer + coarse_aggregate + fine_aggregate +
## age
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 1021.0 110413
## 2 1013.4 106140 7.5562 4272.8 5.4038 2.01e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(gam.m2)
vis.gam(gam.m2, view = c("water", "fine_aggregate"),
plot.type = "contour", color = "topo")
vis.gam(gam.m2, view = c("water","coarse_aggregate"),
plot.type = "contour", color = "topo")